P.C.A.

In [2]:
#Importing the libraries to watch the 'fits' image and get the data array
import astropy
#import plotly.graph_objects as go
from astropy.io import fits
#Importing a library that is useful to read the original file
import pandas as pd
import pylab as plb
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy import asarray as ar,exp
#Importing a visual library with some illustrative set up
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors 
from matplotlib import cm
import numpy as np
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.decomposition import PCA
import math 
import seaborn as sns
from sklearn.linear_model import LogisticRegression
plt.style.use('fivethirtyeight')
plt.rcParams['font.family'] = 'sans-serif' 
plt.rcParams['font.serif'] = 'Ubuntu' 
plt.rcParams['font.monospace'] = 'Ubuntu Mono' 
plt.rcParams['font.size'] = 14 
plt.rcParams['axes.labelsize'] = 12 
plt.rcParams['axes.labelweight'] = 'bold' 
plt.rcParams['axes.titlesize'] = 12 
plt.rcParams['xtick.labelsize'] = 12 
plt.rcParams['ytick.labelsize'] = 12 
#plt.rcParams['legend.fontsize'] = 12 
plt.rcParams['figure.titlesize'] = 12 
plt.rcParams['image.cmap'] = 'jet' 
plt.rcParams['image.interpolation'] = 'none' 
plt.rcParams['figure.figsize'] = (16, 8) 
plt.rcParams['lines.linewidth'] = 2 
plt.rcParams['lines.markersize'] = 8
plt.rcParams["axes.grid"] = False
In [3]:
data=pd.read_csv('star.txt',sep='\s+')
In [4]:
x=np.array(data.F606W-data.F814W)
y=np.array(data.F606W)
In [4]:
data.head()
Out[4]:
#ID X Y F606W error F814W error.1 Chi Sharp
0 8 4462.947 140.859 28.197 0.1036 27.127 0.1068 6.106 0.7310
1 120 5002.486 186.138 20.843 0.0552 19.815 0.0661 10.690 0.4000
2 165 5003.574 195.421 27.014 0.2194 26.068 0.1349 10.651 0.1940
3 226 4448.493 207.417 30.193 0.3641 28.270 0.1636 4.059 -1.9585
4 351 5040.717 224.366 28.935 0.1669 28.941 0.2464 4.274 -0.7720
In [5]:
notar=data.drop(columns=['Sharp','#ID'])
In [6]:
pca=PCA(n_components=3)
pca=pca.fit(notar)
pca_data=pd.DataFrame(pca.transform(notar))
In [7]:
pca_data=pca_data.rename(columns={0:'FirstComponent',1:'SecondComponent',2:'ThirdComponent'})
In [12]:
pca_data.tail()
Out[12]:
FirstComponent SecondComponent ThirdComponent
51475 -2595.533534 4352.168882 -2.548815
51476 -2608.235077 4339.799052 2.189038
51477 -2706.797907 4214.351512 -3.699351
51478 -3666.679220 2983.957433 -4.293191
51479 -3977.662877 2587.956795 -0.506620
In [13]:
notar.corr()
Out[13]:
X Y F606W error F814W error.1 Chi
X 1.000000 0.053451 -0.096873 -0.081905 -0.103088 -0.075036 0.036013
Y 0.053451 1.000000 -0.154213 -0.112309 -0.162024 -0.103044 0.043035
F606W -0.096873 -0.154213 1.000000 0.560325 0.981704 0.516773 0.041261
error -0.081905 -0.112309 0.560325 1.000000 0.565141 0.800688 0.249743
F814W -0.103088 -0.162024 0.981704 0.565141 1.000000 0.572660 0.044171
error.1 -0.075036 -0.103044 0.516773 0.800688 0.572660 1.000000 0.276366
Chi 0.036013 0.043035 0.041261 0.249743 0.044171 0.276366 1.000000
In [14]:
pca_data.corr()
Out[14]:
FirstComponent SecondComponent ThirdComponent
FirstComponent 1.000000e+00 3.600011e-16 8.911462e-17
SecondComponent 3.600011e-16 1.000000e+00 2.532462e-16
ThirdComponent 8.911462e-17 2.532462e-16 1.000000e+00
In [15]:
COL_NAMES=pca_data.columns.tolist()
In [16]:
k=1
for i in range(3):
    col=COL_NAMES[i]
    for j in range(3):
        if k==1:
            plt.subplot(3,3,k)
            plt.subplots_adjust(left=0.025, bottom=0.1, right=0.9, top=1.5, wspace=0.2, hspace=0.7)

            plt.plot(pca_data[col],pca_data[COL_NAMES[j]],color='k')
            plt.xlabel(COL_NAMES[i])
            plt.ylabel(COL_NAMES[j])
        else:
            plt.subplot(3,3,k)
            plt.subplots_adjust(left=0.025, bottom=0.1, right=0.9, top=1.5, wspace=0.2, hspace=0.7)

            plt.plot(pca_data[col],pca_data[COL_NAMES[j]],',',color='k')
            plt.xlabel(COL_NAMES[i])
            plt.ylabel(COL_NAMES[j])

        k=k+1
In [17]:
 from sklearn.feature_selection import mutual_info_regression as mi
In [16]:
COL_NAMES=data.columns.tolist()
In [17]:
MI=[]
for col in COL_NAMES[1:-1]:
    MI.append(mi(np.array(data[col].tolist()).reshape(-1,1),np.array(data.Sharp)))
    print ('Mutual information computed between '+ str(col)+ ' and Sharp')
Mutual information computed between X and Sharp
Mutual information computed between Y and Sharp
Mutual information computed between F606W and Sharp
Mutual information computed between error and Sharp
Mutual information computed between F814W and Sharp
Mutual information computed between error.1 and Sharp
Mutual information computed between Chi and Sharp
In [19]:
NEW_MI=[]
for i in range(len(MI)):
    NEW_MI.append(MI[i][0])
In [20]:
MI_data=pd.DataFrame({'Column':COL_NAMES[1:-1],'Mutual Information':NEW_MI})
In [21]:
MI_data.sort_values(by='Mutual Information',ascending=False).head(3)
Out[21]:
Column Mutual Information
3 error 0.563426
5 error.1 0.554651
4 F814W 0.468174
In [22]:
PCA_COL_NAMES=pca_data.columns.tolist()
In [23]:
MI=[]
for col in PCA_COL_NAMES:
    MI.append(mi(np.array(pca_data[col].tolist()).reshape(-1,1),np.array(data.Sharp)))
    print ('Mutual information computed between '+ str(col)+ ' and Sharp')
Mutual information computed between FirstComponent and Sharp
Mutual information computed between SecondComponent and Sharp
Mutual information computed between ThirdComponent and Sharp
In [24]:
PCA_MI=[]
for i in range(len(MI)):
    PCA_MI.append(MI[i][0])
In [25]:
PCA_MI_data=pd.DataFrame({'Column':PCA_COL_NAMES,'Mutual Information':PCA_MI})
In [26]:
PCA_MI_data
Out[26]:
Column Mutual Information
0 FirstComponent 0.062576
1 SecondComponent 0.051429
2 ThirdComponent 0.414604

P.C.A. Excluding Space

In [6]:
notar=data.drop(columns=['Sharp','#ID','X','Y'])
In [9]:
notar.head()
Out[9]:
F606W error F814W error.1 Chi
0 28.197 0.1036 27.127 0.1068 6.106
1 20.843 0.0552 19.815 0.0661 10.690
2 27.014 0.2194 26.068 0.1349 10.651
3 30.193 0.3641 28.270 0.1636 4.059
4 28.935 0.1669 28.941 0.2464 4.274
In [7]:
pca=PCA(n_components=3)
pca=pca.fit(notar)
pca_data=pd.DataFrame(pca.transform(notar))
In [8]:
pca_data=pca_data.rename(columns={0:'FirstComponent',1:'SecondComponent',2:'ThirdComponent'})
In [22]:
COL_NAMES=pca_data.columns.tolist()
In [19]:
k=1
q=0
for i in range(3):
    col=COL_NAMES[i]
    for j in range(3):
        if k==1 or k==5 or k==9:
            plt.subplot(3,3,k)
            plt.subplots_adjust(left=0.025, bottom=0.1, right=0.9, top=1.5, wspace=0.2, hspace=0.7)

            sns.kdeplot(pca_data[COL_NAMES[i]],color='darkorange')
            #g._legend.remove()
            plt.grid(True)
            plt.xlabel('Values')
            plt.legend([],[], frameon=False)

            plt.xlabel(COL_NAMES[i])
            plt.ylabel('Distribution')
            #plt.ylabel(COL_NAMES[j])
        else:
            plt.subplot(3,3,k)
            plt.subplots_adjust(left=0.025, bottom=0.1, right=0.9, top=1.5, wspace=0.2, hspace=0.7)

            plt.plot(pca_data[col],pca_data[COL_NAMES[j]],',',color='k')
            plt.xlabel(COL_NAMES[i])
            plt.ylabel(COL_NAMES[j])
            plt.grid(True)

        k=k+1
In [20]:
plt.subplot(1,2,1)
plt.plot(pca_data['ThirdComponent'],pca_data['FirstComponent'],',',color='gold')
plt.xlabel('PCA Third Component ')
plt.ylabel('PCA First Component')
plt.grid(True)
plt.subplot(1,2,2)
plt.ylim(30.5,12.5)
plt.ylabel('814 nm Flux')
plt.xlabel('Stellar Color')
plt.plot(np.array(data['F814W']-data['F606W']),data.F814W,',',color='gold')
plt.grid(True)
In [21]:
plt.plot(-pca_data.FirstComponent,data.F814W,',',color='k')
plt.plot((data.F606W-26.5)*1.3+5*data.error,data.F814W,',',color='red')
##plt.plot((data.F606W-26.2)*1.3-data.error,data.F814W,',',color='purple')

#plt.plot(-pca_data.FirstComponent,data.F606W,',',color='red')
plt.grid(True)
In [22]:
from sklearn.metrics import mean_squared_error
In [37]:
A=np.arange(1,3,0.1)
B=np.arange(-30.5,-22.5,0.5)
C=np.arange(-15,15,1)
orig=-pca_data.FirstComponent
max_pca=np.abs(-pca_data.FirstComponent.max())
RMSE=[]
TRIPLET=[]
for a in A:
    
    for b in B:
        for c in C:
            recons=(data.F606W+b)*a+c*data.error
            RMSE.append(np.sqrt(mean_squared_error(recons,orig)))
            TRIPLET.append([a,b,c])
In [38]:
#np.array(RMSE).argmin()
a_opt=TRIPLET[np.array(RMSE).argmin()][0]
b_opt=TRIPLET[np.array(RMSE).argmin()][1]
c_opt=TRIPLET[np.array(RMSE).argmin()][2]
r_opt=(data.F606W+b_opt)*a_opt+c_opt*data.error
In [39]:
D=np.arange(-10,10,0.1)
BEST_RMSE=[]
for d in D:
    recons=r_opt+d*data.F814W
    BEST_RMSE.append(np.sqrt(mean_squared_error(recons,orig)))
In [40]:
BEST_RMSE=np.array(BEST_RMSE)
In [42]:
d_opt=D[BEST_RMSE.argmin()]
In [43]:
r_opt=(data.F606W+b_opt)*a_opt+c_opt*data.error+d_opt*data.F814W
In [44]:
E=np.arange(-10,10,1)
BEST_RMSE=[]
for e in E:
    recons=r_opt+e*data['error.1']
    BEST_RMSE.append(np.sqrt(mean_squared_error(recons,orig)))
In [45]:
BEST_RMSE=np.array(BEST_RMSE)
In [46]:
e_opt=E[np.array(BEST_RMSE).argmin()]
In [47]:
r_opt_first=r_opt+e_opt*data['error.1']
In [48]:
first_RMSE=np.array(BEST_RMSE).min()/max_pca
In [49]:
plt.plot(-pca_data.FirstComponent,data.F814W,',',color='k')
plt.plot(r_opt,data.F814W,',',color='red')
plt.grid(True)
In [50]:
#plt.plot(np.array(data.Chi),np.array(data.F814W-data.F606W),',')
plt.plot(np.array(data.Chi),pca_data['FirstComponent'],',',color='red')
plt.plot(pca_data.SecondComponent*0.96+2.276,pca_data['FirstComponent'],',',color='black')
Out[50]:
[<matplotlib.lines.Line2D at 0x7fa913acfe20>]
In [51]:
r_opt_sec=pca_data.SecondComponent*0.96+2.276
In [52]:
second_RMSE=np.sqrt(mean_squared_error(pca_data.SecondComponent*0.96+2.276,data.Chi))/data.Chi.max()
In [53]:
plt.ylim(30.0,12.5)
plt.plot(np.array(data['F814W']-data['F606W']),data.F814W,',',color='red')
plt.plot(-1.24+1.31*pca_data.ThirdComponent,data.F814W,',',color='k')
Out[53]:
[<matplotlib.lines.Line2D at 0x7fa914841580>]
In [54]:
r_opt_third=-1.24+1.31*pca_data.ThirdComponent
In [55]:
third_RMSE=np.sqrt(mean_squared_error(-1.24+1.31*pca_data.ThirdComponent,np.array(data.F814W-data.F606W)))
In [56]:
third_RMSE=third_RMSE/np.array(data['F814W']-data['F606W']).max()
In [57]:
pca_data['X']=data.X
pca_data['Y']=data.Y
In [58]:
pca_data['Sharp']=data.Sharp
In [59]:
pca_data.head()
Out[59]:
FirstComponent SecondComponent ThirdComponent X Y Sharp
0 -3.348246 3.558359 0.052565 4462.947 140.859 0.7310
1 6.885136 8.439184 0.189286 5002.486 186.138 0.4000
2 -1.896787 8.148357 0.145220 5003.574 195.421 0.1940
3 -5.504066 1.449171 -0.564914 4448.493 207.417 -1.9585
4 -5.110070 1.678896 0.804773 5040.717 224.366 -0.7720
In [60]:
#pca_data=pca_data.drop(columns=['Sharp'])
In [61]:
PCA_COL_NAMES=pca_data.columns.tolist()
In [63]:
MI=[]
for col in PCA_COL_NAMES[0:-1]:
    MI.append(mi(np.array(pca_data[col].tolist()).reshape(-1,1),np.array(data.Sharp)))
    print ('Mutual information computed between '+ str(col)+ ' and Sharp')
Mutual information computed between FirstComponent and Sharp
Mutual information computed between SecondComponent and Sharp
Mutual information computed between ThirdComponent and Sharp
Mutual information computed between X and Sharp
Mutual information computed between Y and Sharp
In [64]:
NEW_MI=[]
for i in range(len(MI)):
    NEW_MI.append(MI[i][0])
In [65]:
MI_data=pd.DataFrame({'Column':PCA_COL_NAMES[0:-1],'Mutual Information':NEW_MI})
In [66]:
MI_data.sort_values(by='Mutual Information', ascending=False)
Out[66]:
Column Mutual Information
0 FirstComponent 0.481823
1 SecondComponent 0.321916
2 ThirdComponent 0.132833
4 Y 0.073511
3 X 0.045362

2-class: Positive or Negative

1. PCA Linear Classifier

In [9]:
data['SharpSign']=data.Sharp.apply(np.sign)
data['SharpSign']=data['SharpSign'].replace(0,1)
In [10]:
pca_data['Target']=data['SharpSign']
In [303]:
pca_data.head()
Out[303]:
FirstComponent SecondComponent ThirdComponent X Y Sharp Target
0 -3.348246 3.558359 0.052565 4462.947 140.859 0.7310 1.0
1 6.885136 8.439184 0.189286 5002.486 186.138 0.4000 1.0
2 -1.896787 8.148357 0.145220 5003.574 195.421 0.1940 1.0
3 -5.504066 1.449171 -0.564914 4448.493 207.417 -1.9585 -1.0
4 -5.110070 1.678896 0.804773 5040.717 224.366 -0.7720 -1.0
In [11]:
from sklearn.svm import LinearSVC as SVC
from sklearn.model_selection import train_test_split
In [325]:
X=pca_data.drop(columns=['Sharp','Target'])
In [326]:
y=pca_data.Target
In [327]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
In [328]:
X_traint,X_train_val,y_traint,y_train_val= train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)
In [345]:
C_SCORE=[]
c=np.arange(0.2,1.2,0.2)
for c_value in c:
    clf=SVC(C=c_value)
    clf.fit(X_traint,y_traint)
    sc=clf.score(X_train_val,y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted' )
0.2 coefficient has been adopted
0.4 coefficient has been adopted
0.6000000000000001 coefficient has been adopted
0.8 coefficient has been adopted
1.0 coefficient has been adopted
In [346]:
MIN_VAL=np.array(C_SCORE).max()
MIN_C=c[np.array(C_SCORE).argmax()]
print ('In the range ' + str(c.min())+ ' and ' +str(c.max())+ '\n')
print ('the best score has been obtained with ' + str(MIN_C))
print ('and it is ' + str(MIN_VAL))
In the range 0.2 and 1.0

the best score has been obtained with 0.6000000000000001
and it is 0.6128444822143985
In [347]:
C_SCORE=[]
c=np.arange(1,11,1)
for c_value in c:
    clf=SVC(C=c_value)
    clf.fit(X_traint,y_traint)
    sc=clf.score(X_train_val,y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted' )
1 coefficient has been adopted
2 coefficient has been adopted
3 coefficient has been adopted
4 coefficient has been adopted
5 coefficient has been adopted
6 coefficient has been adopted
7 coefficient has been adopted
8 coefficient has been adopted
9 coefficient has been adopted
10 coefficient has been adopted
In [349]:
MIN_VAL=np.array(C_SCORE).max()
MIN_C=c[np.array(C_SCORE).argmax()]
print ('In the range ' + str(c.min())+ ' and ' +str(c.max())+ '\n')
print ('the best score has been obtained with ' + str(MIN_C))
print ('and it is ' + str(MIN_VAL))
In the range 1 and 10

the best score has been obtained with 7
and it is 0.6253490348427826
In [343]:
C_SCORE=[]
c=np.arange(10,110,10)
for c_value in c:
    clf=SVC(C=c_value)
    clf.fit(X_traint,y_traint)
    sc=clf.score(X_train_val,y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted' )
10 coefficient has been adopted
20 coefficient has been adopted
30 coefficient has been adopted
40 coefficient has been adopted
50 coefficient has been adopted
60 coefficient has been adopted
70 coefficient has been adopted
80 coefficient has been adopted
90 coefficient has been adopted
100 coefficient has been adopted
In [344]:
MIN_VAL=np.array(C_SCORE).max()
MIN_C=c[np.array(C_SCORE).argmax()]
print ('In the range ' + str(c.min())+ ' and ' +str(c.max())+ '\n')
print ('the best score has been obtained with ' + str(MIN_C))
print ('and it is ' + str(MIN_VAL))
In the range 10 and 100

the best score has been obtained with 30
and it is 0.6247420177248998
In [386]:
OPT_C=[0.6,7,30]
In [388]:
FIN_SCORE=[]
for opt_C in OPT_C:
    clf=SVC(C=opt_c)
    clf.fit(X_train,y_train)
    fin_score=clf.score(X_test,y_test)
    FIN_SCORE.append(fin_score)
In [391]:
FIN_SCORE=np.array(FIN_SCORE)
In [392]:
fin_score=FIN_SCORE.max()
In [394]:
print ('The PCA dataset gave a best classification with ' + str(fin_score*100)+ '% of accuracy with a linear classifier')
The PCA dataset gave a best classification with 62.849650349650354% of accuracy with a linear classifier

1.2 Dataset Linear Classifier

In [413]:
X=data.drop(columns=['SharpSign'])
In [414]:
X=pca_data.drop(columns=['Sharp','Target'])
In [415]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
In [416]:
X_traint,X_train_val,y_traint,y_train_val= train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)
In [417]:
C_SCORE=[]
c=np.arange(0.2,1.2,0.2)
for c_value in c:
    clf=SVC(C=c_value)
    clf.fit(X_traint,y_traint)
    sc=clf.score(X_train_val,y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted' )
0.2 coefficient has been adopted
0.4 coefficient has been adopted
0.6000000000000001 coefficient has been adopted
0.8 coefficient has been adopted
1.0 coefficient has been adopted
In [418]:
MIN_VAL=np.array(C_SCORE).max()
MIN_C=c[np.array(C_SCORE).argmax()]
print ('In the range ' + str(c.min())+ ' and ' +str(c.max())+ '\n')
print ('the best score has been obtained with ' + str(MIN_C))
print ('and it is ' + str(MIN_VAL))
In the range 0.2 and 1.0

the best score has been obtained with 1.0
and it is 0.6195216705111084
In [419]:
C_SCORE=[]
c=np.arange(1,11,1)
for c_value in c:
    clf=SVC(C=c_value)
    clf.fit(X_traint,y_traint)
    sc=clf.score(X_train_val,y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted' )
1 coefficient has been adopted
2 coefficient has been adopted
3 coefficient has been adopted
4 coefficient has been adopted
5 coefficient has been adopted
6 coefficient has been adopted
7 coefficient has been adopted
8 coefficient has been adopted
9 coefficient has been adopted
10 coefficient has been adopted
In [420]:
MIN_VAL=np.array(C_SCORE).max()
MIN_C=c[np.array(C_SCORE).argmax()]
print ('In the range ' + str(c.min())+ ' and ' +str(c.max())+ '\n')
print ('the best score has been obtained with ' + str(MIN_C))
print ('and it is ' + str(MIN_VAL))
In the range 1 and 10

the best score has been obtained with 9
and it is 0.6219497389826393
In [421]:
C_SCORE=[]
c=np.arange(10,110,10)
for c_value in c:
    clf=SVC(C=c_value)
    clf.fit(X_traint,y_traint)
    sc=clf.score(X_train_val,y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted' )
10 coefficient has been adopted
20 coefficient has been adopted
30 coefficient has been adopted
40 coefficient has been adopted
50 coefficient has been adopted
60 coefficient has been adopted
70 coefficient has been adopted
80 coefficient has been adopted
90 coefficient has been adopted
100 coefficient has been adopted
In [422]:
MIN_VAL=np.array(C_SCORE).max()
MIN_C=c[np.array(C_SCORE).argmax()]
print ('In the range ' + str(c.min())+ ' and ' +str(c.max())+ '\n')
print ('the best score has been obtained with ' + str(MIN_C))
print ('and it is ' + str(MIN_VAL))
In the range 10 and 100

the best score has been obtained with 70
and it is 0.6226781595240986
In [423]:
OPT_C=[1,70]
In [424]:
FIN_SCORE=[]
for opt_C in OPT_C:
    clf=SVC(C=opt_c)
    clf.fit(X_train,y_train)
    fin_score=clf.score(X_test,y_test)
    FIN_SCORE.append(fin_score)
In [425]:
FIN_SCORE=np.array(FIN_SCORE)
In [426]:
fin_score=FIN_SCORE.max()
In [427]:
print ('The PCA dataset gave a best classification with ' + str(fin_score*100)+ '% of accuracy with a linear classifier')
The PCA dataset gave a best classification with 61.34421134421134% of accuracy with a linear classifier

1.3 Best Features Classification Original Data

In [433]:
opt_data=data.drop(columns=['#ID','X','Y','Chi','Sharp','F606W'])
In [434]:
opt_data.head()
Out[434]:
error F814W error.1 SharpSign
0 0.1036 27.127 0.1068 1.0
1 0.0552 19.815 0.0661 1.0
2 0.2194 26.068 0.1349 1.0
3 0.3641 28.270 0.1636 -1.0
4 0.1669 28.941 0.2464 -1.0
In [436]:
X=opt_data.drop(columns=['SharpSign'])
In [437]:
data['SharpSign']=data.Sharp.apply(np.sign)
data['SharpSign']=data['SharpSign'].replace(0,1)
y=data.SharpSign
In [438]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
In [439]:
X_traint,X_train_val,y_traint,y_train_val= train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)
In [440]:
C_SCORE=[]
c=np.arange(0.2,1.2,0.2)
for c_value in c:
    clf=SVC(C=c_value)
    clf.fit(X_traint,y_traint)
    sc=clf.score(X_train_val,y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted' )
0.2 coefficient has been adopted
0.4 coefficient has been adopted
0.6000000000000001 coefficient has been adopted
0.8 coefficient has been adopted
1.0 coefficient has been adopted
In [441]:
MIN_VAL=np.array(C_SCORE).max()
MIN_C=c[np.array(C_SCORE).argmax()]
print ('In the range ' + str(c.min())+ ' and ' +str(c.max())+ '\n')
print ('the best score has been obtained with ' + str(MIN_C))
print ('and it is ' + str(MIN_VAL))
In the range 0.2 and 1.0

the best score has been obtained with 0.2
and it is 0.6895714459147748
In [442]:
C_SCORE=[]
c=np.arange(1,11,1)
for c_value in c:
    clf=SVC(C=c_value)
    clf.fit(X_traint,y_traint)
    sc=clf.score(X_train_val,y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted' )
1 coefficient has been adopted
2 coefficient has been adopted
3 coefficient has been adopted
4 coefficient has been adopted
5 coefficient has been adopted
6 coefficient has been adopted
7 coefficient has been adopted
8 coefficient has been adopted
9 coefficient has been adopted
10 coefficient has been adopted
In [443]:
MIN_VAL=np.array(C_SCORE).max()
MIN_C=c[np.array(C_SCORE).argmax()]
print ('In the range ' + str(c.min())+ ' and ' +str(c.max())+ '\n')
print ('the best score has been obtained with ' + str(MIN_C))
print ('and it is ' + str(MIN_VAL))
In the range 1 and 10

the best score has been obtained with 1
and it is 0.6692970741774918
In [444]:
C_SCORE=[]
c=np.arange(10,110,10)
for c_value in c:
    clf=SVC(C=c_value)
    clf.fit(X_traint,y_traint)
    sc=clf.score(X_train_val,y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted' )
10 coefficient has been adopted
20 coefficient has been adopted
30 coefficient has been adopted
40 coefficient has been adopted
50 coefficient has been adopted
60 coefficient has been adopted
70 coefficient has been adopted
80 coefficient has been adopted
90 coefficient has been adopted
100 coefficient has been adopted
In [445]:
MIN_VAL=np.array(C_SCORE).max()
MIN_C=c[np.array(C_SCORE).argmax()]
print ('In the range ' + str(c.min())+ ' and ' +str(c.max())+ '\n')
print ('the best score has been obtained with ' + str(MIN_C))
print ('and it is ' + str(MIN_VAL))
In the range 10 and 100

the best score has been obtained with 40
and it is 0.6248634211484764
In [447]:
OPT_C=[0.2,1,40]
In [448]:
FIN_SCORE=[]
for opt_C in OPT_C:
    clf=SVC(C=opt_c)
    clf.fit(X_train,y_train)
    fin_score=clf.score(X_test,y_test)
    FIN_SCORE.append(fin_score)
In [449]:
FIN_SCORE=np.array(FIN_SCORE)
In [450]:
fin_score=FIN_SCORE.max()
In [451]:
print ('The PCA dataset gave a best classification with ' + str(fin_score*100)+ '% of accuracy with a linear classifier')
The PCA dataset gave a best classification with 62.771950271950274% of accuracy with a linear classifier

1.4 Best Feature Classification PCA data

In [452]:
opt_data=pca_data[['FirstComponent','SecondComponent','ThirdComponent']]
In [453]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
In [454]:
X_traint,X_train_val,y_traint,y_train_val= train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)
In [455]:
C_SCORE=[]
c=np.arange(0.2,1.2,0.2)
for c_value in c:
    clf=SVC(C=c_value)
    clf.fit(X_traint,y_traint)
    sc=clf.score(X_train_val,y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted' )
0.2 coefficient has been adopted
0.4 coefficient has been adopted
0.6000000000000001 coefficient has been adopted
0.8 coefficient has been adopted
1.0 coefficient has been adopted
In [456]:
MIN_VAL=np.array(C_SCORE).max()
MIN_C=c[np.array(C_SCORE).argmax()]
print ('In the range ' + str(c.min())+ ' and ' +str(c.max())+ '\n')
print ('the best score has been obtained with ' + str(MIN_C))
print ('and it is ' + str(MIN_VAL))
In the range 0.2 and 1.0

the best score has been obtained with 0.2
and it is 0.6904212698798106
In [457]:
C_SCORE=[]
c=np.arange(1,11,1)
for c_value in c:
    clf=SVC(C=c_value)
    clf.fit(X_traint,y_traint)
    sc=clf.score(X_train_val,y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted' )
1 coefficient has been adopted
2 coefficient has been adopted
3 coefficient has been adopted
4 coefficient has been adopted
5 coefficient has been adopted
6 coefficient has been adopted
7 coefficient has been adopted
8 coefficient has been adopted
9 coefficient has been adopted
10 coefficient has been adopted
In [458]:
MIN_VAL=np.array(C_SCORE).max()
MIN_C=c[np.array(C_SCORE).argmax()]
print ('In the range ' + str(c.min())+ ' and ' +str(c.max())+ '\n')
print ('the best score has been obtained with ' + str(MIN_C))
print ('and it is ' + str(MIN_VAL))
In the range 1 and 10

the best score has been obtained with 1
and it is 0.6713609323782931
In [459]:
C_SCORE=[]
c=np.arange(10,110,10)
for c_value in c:
    clf=SVC(C=c_value)
    clf.fit(X_traint,y_traint)
    sc=clf.score(X_train_val,y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted' )
10 coefficient has been adopted
20 coefficient has been adopted
30 coefficient has been adopted
40 coefficient has been adopted
50 coefficient has been adopted
60 coefficient has been adopted
70 coefficient has been adopted
80 coefficient has been adopted
90 coefficient has been adopted
100 coefficient has been adopted
In [460]:
MIN_VAL=np.array(C_SCORE).max()
MIN_C=c[np.array(C_SCORE).argmax()]
print ('In the range ' + str(c.min())+ ' and ' +str(c.max())+ '\n')
print ('the best score has been obtained with ' + str(MIN_C))
print ('and it is ' + str(MIN_VAL))
In the range 10 and 100

the best score has been obtained with 70
and it is 0.6231637732184048
In [461]:
OPT_C=[0.2,1,70]
In [462]:
FIN_SCORE=[]
for opt_C in OPT_C:
    clf=SVC(C=opt_c)
    clf.fit(X_train,y_train)
    fin_score=clf.score(X_test,y_test)
    FIN_SCORE.append(fin_score)
In [463]:
FIN_SCORE=np.array(FIN_SCORE)
In [464]:
fin_score=FIN_SCORE.max()
In [465]:
print ('The PCA dataset gave a best classification with ' + str(fin_score*100)+ '% of accuracy with a linear classifier')
The PCA dataset gave a best classification with 63.238150738150736% of accuracy with a linear classifier

2. Best method non linear

In [72]:
opt_data=pca_data[['FirstComponent','SecondComponent']]
In [85]:
opt_data['Target']=data.SharpSign
<ipython-input-85-60262cea00d2>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  opt_data['Target']=data.SharpSign
In [86]:
opt_data
Out[86]:
FirstComponent SecondComponent Target
0 -3.348246 3.558359 1.0
1 6.885136 8.439184 1.0
2 -1.896787 8.148357 1.0
3 -5.504066 1.449171 -1.0
4 -5.110070 1.678896 -1.0
... ... ... ...
51475 -1.445934 7.015818 1.0
51476 3.290770 6.468575 -1.0
51477 -2.608413 1.785943 1.0
51478 -2.911814 3.636362 1.0
51479 0.982980 6.055359 1.0

51480 rows × 3 columns

In [74]:
X=opt_data.drop(columns=['Target'])
In [75]:
sns.scatterplot(opt_data.FirstComponent,opt_data.SecondComponent,hue=opt_data.Target,palette='plasma')
plt.grid(True)
In [78]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
In [87]:
y=opt_data.Target
In [88]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
In [485]:
c_list=np.arange(0.10,1.1,0.10)
k=0
FIN_SCORE=[]
for c in c_list:
    clf=SVC(C=c, kernel='rbf')
    clf.fit(X_train,y_train)
    fin_score=clf.score(X_test,y_test)
    FIN_SCORE.append(fin_score)
    
    if k==1:
        print('20% of the C values inspected')
    if k==4:
        print('50% of the C values inspected')
    if k==7:
        print ('80% of the C values inspected')
    if k == 9:
        print ('100% of the C values inspected \n')
        print ('Process completed')
    k=k+1
20% of the C values inspected
50% of the C values inspected
80% of the C values inspected
100% of the C values inspected 

Process completed
In [487]:
FIN_SCORE=np.array(FIN_SCORE)
In [488]:
fin_score=FIN_SCORE.max()
i=FIN_SCORE.argmax()
c_max=c_list[i]
In [500]:
print ('The best classification is done between '+ str(c_list.min()) + ' and ' + str(c_list.max()) + '\n')
print('is ' + str(c_max) +', obtaining the following accuracy: '+str(fin_score*100)+'%')
The best classification is done between 0.1 and 1.0

is 0.8, obtaining the following accuracy: 71.88228438228438%
In [501]:
c_list=np.arange(1.,11,1.)
k=0
FIN_SCORE=[]
for c in c_list:
    clf=SVC(C=c, kernel='rbf')
    clf.fit(X_train,y_train)
    fin_score=clf.score(X_test,y_test)
    FIN_SCORE.append(fin_score)
    
    if k==1:
        print('20% of the C values inspected')
    if k==4:
        print('50% of the C values inspected')
    if k==7:
        print ('80% of the C values inspected')
    if k == 9:
        print ('100% of the C values inspected \n')
        print ('Process completed')
    k=k+1
20% of the C values inspected
50% of the C values inspected
80% of the C values inspected
100% of the C values inspected 

Process completed
In [504]:
FIN_SCORE=np.array(FIN_SCORE)
fin_score=FIN_SCORE.max()
i=FIN_SCORE.argmax()
c_max=c_list[i]
print ('The best classification is done between '+ str(c_list.min()) + ' and ' + str(c_list.max()) + '\n')
print('is ' + str(c_max) +', obtaining the following accuracy: '+str(fin_score*100)+'%')
The best classification is done between 1.0 and 10.0

is 2.0, obtaining the following accuracy: 71.95998445998445%
In [506]:
c_list=np.arange(10,60,10)
k=0
FIN_SCORE=[]
for c in c_list:
    clf=SVC(C=c, kernel='rbf')
    clf.fit(X_train,y_train)
    fin_score=clf.score(X_test,y_test)
    FIN_SCORE.append(fin_score)
    if k==0:
        print('20% of the C values inspected')
    if k==1:
        print('40% of the C values inspected')
    if k==2:
        print('60% of the C values inspected')
    if k==3:
        print('80% of the C values inspected')
    if k==4:
        print('100% of the C values inspected')
    k=k+1
20% of the C values inspected
40% of the C values inspected
60% of the C values inspected
80% of the C values inspected
100% of the C values inspected
In [508]:
FIN_SCORE=np.array(FIN_SCORE)
fin_score=FIN_SCORE.max()
i=FIN_SCORE.argmax()
c_max=c_list[i]
print ('The best classification is done between '+ str(c_list.min()) + ' and ' + str(c_list.max()) + '\n')
print('is ' + str(c_max) +', obtaining the following accuracy: '+str(fin_score*100)+'%')
The best classification is done between 10 and 50

is 20, obtaining the following accuracy: 71.86285936285937%
In [510]:
c_list=np.arange(100,600,100)
k=0
FIN_SCORE=[]
for c in c_list:
    clf=SVC(C=c, kernel='rbf')
    clf.fit(X_train,y_train)
    fin_score=clf.score(X_test,y_test)
    FIN_SCORE.append(fin_score)
    if k==0:
        print('20% of the C values inspected \n')
    if k==1:
        print('40% of the C values inspected \n')
    if k==2:
        print('60% of the C values inspected \n')
    if k==3:
        print('80% of the C values inspected \n')
    if k==4:
        print('100% of the C values inspected \n')
        print ('Process completed')
    k=k+1
20% of the C values inspected 

40% of the C values inspected 

60% of the C values inspected 

80% of the C values inspected 

100% of the C values inspected 

Process completed
In [512]:
FIN_SCORE=np.array(FIN_SCORE)
fin_score=FIN_SCORE.max()
i=FIN_SCORE.argmax()
c_max=c_list[i]
print ('The best classification is done between '+ str(c_list.min()) + ' and ' + str(c_list.max()) + '\n')
print('is ' + str(c_max) +', obtaining the following accuracy: '+str(fin_score*100)+'%')
The best classification is done between 100 and 500

is 300, obtaining the following accuracy: 71.95027195027195%
In [513]:
clf=SVC(C=1000, kernel='rbf')
clf.fit(X_train,y_train)
fin_score=clf.score(X_test,y_test)
In [516]:
print('The score for C=1000 is ' +str(fin_score*100) +'%')
The score for C=1000 is 72.03768453768454%
In [517]:
clf=SVC(C=10000, kernel='rbf')
clf.fit(X_train,y_train)
fin_score=clf.score(X_test,y_test)
In [518]:
print('The score for C=10000 is ' +str(fin_score*100) +'%')
The score for C=10000 is 72.21250971250971%
In [90]:
clf=SVC(C=100000, kernel='rbf')
clf.fit(X_train,y_train)
fin_score=clf.score(X_test,y_test)
In [91]:
print('The score for C=100000 is ' +str(fin_score*100) +'% \n')
print ('SVM best score: 72.2%')
The score for C=100000 is 72.21250971250971% 

SVM best score: 72.2%
In [92]:
pred=clf.predict(X_test)
In [93]:
pred_data=pd.DataFrame()
pred_data['FirstComponent']= X_test['FirstComponent']
pred_data['SecondComponent']=X_test['SecondComponent']
pred_data['Target']=y_test
pred_data['Prediction']=pred
In [94]:
pred_data.head()
Out[94]:
FirstComponent SecondComponent Target Prediction
44808 -2.258918 2.654765 1.0 1.0
18329 -4.022375 1.007576 1.0 1.0
17644 -1.415441 -1.475861 1.0 1.0
32773 7.309769 -1.278639 1.0 -1.0
4139 0.761960 -0.158196 1.0 1.0
In [95]:
pred_data.to_csv('SVMprediction.csv')
In [541]:
plt.subplot(2,1,1)
sns.scatterplot(pred_data.FirstComponent,pred_data.SecondComponent,hue=pred_data.Target,palette='plasma')
plt.grid(True)
plt.xlabel('First Component',fontsize=20)
plt.ylabel('Second Component',fontsize=20)
plt.subplot(2,1,2)
sns.scatterplot(pred_data.FirstComponent,pred_data.SecondComponent,hue=pred_data.Prediction,palette='plasma')
plt.grid(True)
plt.xlabel('First Component',fontsize=20)
plt.ylabel('Second Component',fontsize=20)
Out[541]:
Text(0, 0.5, 'Second Component')
In [542]:
X_test
Out[542]:
FirstComponent SecondComponent
44808 -2.258918 2.654765
18329 -4.022375 1.007576
17644 -1.415441 -1.475861
32773 7.309769 -1.278639
4139 0.761960 -0.158196
... ... ...
14720 10.005540 -2.164075
15664 -5.581327 1.035751
33420 -6.213071 -0.091682
13832 6.567993 -1.032399
17202 8.252922 -0.205586

10296 rows × 2 columns

In [111]:
import matplotlib.cm as cm
xx, yy = np.meshgrid(np.linspace(-15, 20, 500),
                     np.linspace(-15, 25, 500))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

fig = plt.figure(figsize=(16,8))
fig.patch.set_facecolor('white')
ax = fig.gca()
imshow_handle = plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
           origin='lower', alpha=.5, cmap='plasma')
contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2,
                       linetypes='--', colors='red')
sns.scatterplot(pred_data.FirstComponent,pred_data.SecondComponent,hue=pred_data.Target,palette='plasma')
plt.xlabel('$x_1$', fontsize=14)
plt.ylabel('$x_2$', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
#plt.xlim(-3, 3)
#plt.ylim(-3, 3)
plt.legend()
plt.show()
<ipython-input-111-fdedc844d781>:7: UserWarning: The following kwargs were not used by contour: 'linetypes'
  contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2,

Best method 3 Features

In [26]:
opt_data=pca_data[['FirstComponent','SecondComponent','ThirdComponent']]
opt_data['Target']=data.SharpSign
In [27]:
y=opt_data.Target
In [32]:
import plotly.express as px
#df = px.data.iris()
fig = px.scatter_3d(opt_data, x='FirstComponent', y='SecondComponent', z='ThirdComponent',
              color='Target')

fig.update_traces(marker=dict(size=2))
fig.show()
In [28]:
X=opt_data.drop(columns=['Target'])
y=opt_data.Target
In [29]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.9, random_state=42)
In [31]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42)
In [34]:
K_LIST=['linear', 'poly', 'rbf', 'sigmoid']
In [64]:
BEST_KERNEL=[]
k=0
for i in range(5):
    FIN_SCORE=[]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42)
    for ker in K_LIST:
        clf=SVC(kernel=ker)
        clf.fit(X_train,y_train)
        fin_score=clf.score(X_val,y_val)
        FIN_SCORE.append(fin_score)
        k=k+1
        print(ker + ' Kernel has been explored')
    FIN_SCORE=np.array(FIN_SCORE)
    
    BEST_KERNEL.append(K_LIST[FIN_SCORE.argmax()])
    print('Cross validation ' + str(i) + ' out of 4 \n')
linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 0 out of 4 

linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 1 out of 4 

linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 2 out of 4 

linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 3 out of 4 

linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 4 out of 4 

In [85]:
CV_DATA=pd.DataFrame({'CV Number':np.arange(1,6,1),'Choosen Kernel': BEST_KERNEL})
In [93]:
sns.countplot(CV_DATA['Choosen Kernel'])
Out[93]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa7d0fc4d00>
In [39]:
best_kernel='rbf'
In [96]:
len(c_list)
Out[96]:
100
In [97]:
c_list=np.arange(0.5,50.5,0.5)
k=0
PERC=['20%','40%','60%','80%','100%']
K=[20,40,60,80,100]
BEST_C=[]
for i in range(5):
    FIN_SCORE=[]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42)
    for c in c_list:
        k=k+1
        clf=SVC(C=c,kernel=best_kernel)
        clf.fit(X_train,y_train)
        fin_score=clf.score(X_val,y_val)
        FIN_SCORE.append(fin_score)
        #k=k+1
        if k in K:
            ind=K.index(k)
            print (PERC[ind] + ' of the C values has been explored')
    FIN_SCORE=np.array(FIN_SCORE)
    BEST_C.append(c_list[FIN_SCORE.argmax()])
    print('Cross validation ' + str(i) + ' out of 4 \n')
20% of the C values has been explored
40% of the C values has been explored
60% of the C values has been explored
80% of the C values has been explored
100% of the C values has been explored
Cross validation 0 out of 4 

Cross validation 1 out of 4 

Cross validation 2 out of 4 

Cross validation 3 out of 4 

Cross validation 4 out of 4 

In [100]:
sns.countplot(BEST_C)
plt.xlabel('Chosen C')
plt.ylabel('Count')
Out[100]:
Text(0, 0.5, 'Count')
In [101]:
FIN_SCORE=np.array(FIN_SCORE)
In [102]:
best_c=c_list[FIN_SCORE.argmax()]
In [107]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.7, random_state=42)
In [108]:
clf=SVC(kernel=best_kernel,C=best_c)
clf.fit(X_train,y_train)
fin_score=clf.score(X_test,y_test)
In [119]:
prediction=clf.predict(X_test.drop(columns=['Target']))
In [113]:
print('The final score with 3 feature is ' + str(fin_score*100) +'% ')
The final score with 3 feature is 71.66999666999668% 
In [121]:
test_data=X_test.copy()
test_data['Target']=y_test
test_data['Prediction']=prediction
In [122]:
test_data
Out[122]:
FirstComponent SecondComponent ThirdComponent Target Prediction
44808 -2.258918 2.654765 0.723507 1.0 1.0
18329 -4.022375 1.007576 0.673480 1.0 1.0
17644 -1.415441 -1.475861 0.165084 1.0 1.0
32773 7.309769 -1.278639 0.261090 1.0 -1.0
4139 0.761960 -0.158196 -0.737992 1.0 1.0
... ... ... ... ... ...
8700 -4.543068 -0.961139 0.179753 1.0 1.0
50672 4.930508 6.876130 -0.119633 1.0 1.0
3123 -5.931023 -0.592387 0.432175 1.0 -1.0
10590 -4.090944 -1.236527 -0.152963 1.0 1.0
44054 -6.672992 -1.196730 0.058784 -1.0 -1.0

36036 rows × 5 columns

In [141]:
import plotly.express as px
#fig=go.figure()
#plt.subplot(1,2,1)
fig = px.scatter_3d(test_data, x='FirstComponent', y='SecondComponent', z='ThirdComponent',
              color='Prediction')
fig.update_traces(marker=dict(size=2))

fig.show()
In [142]:
fig = px.scatter_3d(test_data, x='FirstComponent', y='SecondComponent', z='Target',
              color='Prediction')
fig.update_traces(marker=dict(size=2))

fig.show()
In [169]:
import itertools
from string import ascii_uppercase
from sklearn.metrics import confusion_matrix

y_test=test_data.Target
predic = prediction

columns = ['Negative','Non Negative']

confm = confusion_matrix(y_test, predic)
df_cm = pd.DataFrame(confm.astype(float), index=columns, columns=columns)

ax = sns.heatmap(df_cm, cmap='plasma',annot=True,fmt='g')
In [170]:
def precision(confusion):
    TP=confusion[0][0]
    TN=confusion[1][1]
    FP=confusion[0][1]
    FN=confusion[1][0]
    pres_a=TP/(TP+FN)
    pres_b=TN/(TN+FP)
    return [pres_a,pres_b]
In [172]:
def recall(confusion):
    TP=confusion[0][0]
    TN=confusion[1][1]
    FP=confusion[0][1]
    FN=confusion[1][0]
    rec_a=TP/(TP+FP)
    rec_b=TN/(TN+FN)
    return [rec_a,rec_b]
In [176]:
def statistics(confusion):
    stat=pd.DataFrame({'Negative':[precision(confusion)[0],recall(confusion)[0]],'Non Negative':[precision(confusion)[1],recall(confusion)[1]]})
    stat.index=['Precision','Recall']
    return stat
In [177]:
statistics(confm)
Out[177]:
Negative Non Negative
Precision 0.769380 0.703551
Recall 0.393128 0.924370

Best method 3 feature 3 classes

In [12]:
opt_data=pca_data[['FirstComponent','SecondComponent','ThirdComponent']]
data['SharpSign']=data.Sharp.apply(np.sign)
opt_data['Target']=data['SharpSign']
In [13]:
y=opt_data.Target
In [182]:
import plotly.express as px
#df = px.data.iris()
fig = px.scatter_3d(opt_data, x='FirstComponent', y='SecondComponent', z='ThirdComponent',
              color='Target')

fig.update_traces(marker=dict(size=2))
fig.show()
In [183]:
X=opt_data.drop(columns=['Target'])
y=opt_data.Target
In [184]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
In [185]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.9, random_state=42)
In [186]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42)
In [187]:
K_LIST=['linear', 'poly', 'rbf', 'sigmoid']
In [188]:
BEST_KERNEL=[]
k=0
for i in range(5):
    FIN_SCORE=[]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42)
    k=0
    for ker in K_LIST:
        clf=SVC(kernel=ker)
        clf.fit(X_train,y_train)
        fin_score=clf.score(X_val,y_val)
        FIN_SCORE.append(fin_score)
        k=k+1
        print(ker + ' Kernel has been explored')
    FIN_SCORE=np.array(FIN_SCORE)
    
    BEST_KERNEL.append(K_LIST[FIN_SCORE.argmax()])
    print('Cross validation ' + str(i) + ' out of 4 \n')
linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 0 out of 4 

linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 1 out of 4 

linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 2 out of 4 

linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 3 out of 4 

linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 4 out of 4 

In [190]:
CV_DATA=pd.DataFrame({'CV Number':np.arange(1,6,1),'Choosen Kernel': BEST_KERNEL})
In [191]:
sns.countplot(CV_DATA['Choosen Kernel'])
Out[191]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa7d378b070>
In [192]:
best_kernel='rbf'
In [194]:
c_list=np.arange(0.5,50.5,0.5)
k=0
PERC=['20%','40%','60%','80%','100%']
K=[20,40,60,80,100]
BEST_C=[]
for i in range(5):
    FIN_SCORE=[]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42)
    k=0
    for c in c_list:
        k=k+1
        clf=SVC(C=c,kernel=best_kernel)
        clf.fit(X_train,y_train)
        fin_score=clf.score(X_val,y_val)
        FIN_SCORE.append(fin_score)
        #k=k+1
        if k in K:
            ind=K.index(k)
            print (PERC[ind] + ' of the C values has been explored')
    FIN_SCORE=np.array(FIN_SCORE)
    BEST_C.append(c_list[FIN_SCORE.argmax()])
    print('Cross validation ' + str(i) + ' out of 4 \n')
20% of the C values has been explored
40% of the C values has been explored
60% of the C values has been explored
80% of the C values has been explored
100% of the C values has been explored
Cross validation 0 out of 4 

20% of the C values has been explored
40% of the C values has been explored
60% of the C values has been explored
80% of the C values has been explored
100% of the C values has been explored
Cross validation 1 out of 4 

20% of the C values has been explored
40% of the C values has been explored
60% of the C values has been explored
80% of the C values has been explored
100% of the C values has been explored
Cross validation 2 out of 4 

20% of the C values has been explored
40% of the C values has been explored
60% of the C values has been explored
80% of the C values has been explored
100% of the C values has been explored
Cross validation 3 out of 4 

20% of the C values has been explored
40% of the C values has been explored
60% of the C values has been explored
80% of the C values has been explored
100% of the C values has been explored
Cross validation 4 out of 4 

In [195]:
sns.countplot(BEST_C)
plt.xlabel('Chosen C')
plt.ylabel('Count')
Out[195]:
Text(0, 0.5, 'Count')
In [196]:
FIN_SCORE=np.array(FIN_SCORE)
In [197]:
best_c=c_list[FIN_SCORE.argmax()]
In [14]:
best_c=17
best_kernel='rbf'
In [199]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.7, random_state=42)
In [200]:
clf=SVC(kernel=best_kernel,C=best_c)
clf.fit(X_train,y_train)
fin_score=clf.score(X_test,y_test)
In [203]:
prediction=clf.predict(X_test)
In [204]:
print('The final score with 3 feature is ' + str(fin_score*100) +'% ')
The final score with 3 feature is 71.57564657564657% 
In [205]:
test_data=X_test.copy()
test_data['Target']=y_test
test_data['Prediction']=prediction
In [206]:
test_data
Out[206]:
FirstComponent SecondComponent ThirdComponent Target Prediction
44808 -2.258918 2.654765 0.723507 1.0 1.0
18329 -4.022375 1.007576 0.673480 1.0 1.0
17644 -1.415441 -1.475861 0.165084 1.0 1.0
32773 7.309769 -1.278639 0.261090 1.0 -1.0
4139 0.761960 -0.158196 -0.737992 1.0 1.0
... ... ... ... ... ...
8700 -4.543068 -0.961139 0.179753 1.0 1.0
50672 4.930508 6.876130 -0.119633 1.0 1.0
3123 -5.931023 -0.592387 0.432175 1.0 -1.0
10590 -4.090944 -1.236527 -0.152963 1.0 1.0
44054 -6.672992 -1.196730 0.058784 -1.0 -1.0

36036 rows × 5 columns

In [207]:
import plotly.express as px
#fig=go.figure()
#plt.subplot(1,2,1)
fig = px.scatter_3d(test_data, x='FirstComponent', y='SecondComponent', z='ThirdComponent',
              color='Prediction')
fig.update_traces(marker=dict(size=2))

fig.show()
In [208]:
fig = px.scatter_3d(test_data, x='FirstComponent', y='SecondComponent', z='Target',
              color='Prediction')
fig.update_traces(marker=dict(size=2))

fig.show()
In [214]:
import itertools
from string import ascii_uppercase
from sklearn.metrics import confusion_matrix

y_test=test_data.Target
predic = prediction

columns = ['Negative','Zero','Positive']

confm = confusion_matrix(y_test, predic)
df_cm = pd.DataFrame(confm.astype(float), index=columns, columns=columns)

ax = sns.heatmap(df_cm, cmap='plasma',annot=True,fmt='g')
In [227]:
def precision(confusion,clas):
    if clas=='Negative':
        TP=confusion[0][0]
        FN=confusion[1][0]+confusion[2][0]
        pres=TP/(TP+FN)
    if clas=='Positive':
        TP=confusion[2][2]
        FN=confusion[2][0]+confusion[2][1]
        pres=TP/(TP+FN)
    if clas=='Zero':
        TP=confusion[1][1]
        FN=confusion[1][0]+confusion[1][2]
        pres=TP/(TP+FN)
    return pres
In [235]:
def recal(confusion,clas):
    if clas=='Negative':
        TP=confusion[0][0]
        FP=confusion[0][1]+confusion[0][2]
        rec=TP/(TP+FP)
    if clas=='Positive':
        TP=confusion[2][2]
        FP=confusion[0][2]+confusion[1][2]
        rec=TP/(TP+FP)
    if clas=='Zero':
        TP=confusion[1][1]
        FP=confusion[0][1]+confusion[2][1]
        rec=TP/(TP+FP)
    return rec
In [236]:
precision(confm,'Negative'),precision(confm,'Positive'),precision(confm,'Zero')
Out[236]:
(0.7691137840453477, 0.9174284429791698, 0.9658031088082901)
In [237]:
recal(confm,'Negative'),recal(confm,'Positive'),recal(confm,'Zero')
Out[237]:
(0.3949031021509193, 0.6817625004640113, 1.0)
In [172]:
def recall(confusion):
    TP=confusion[0][0]
    TN=confusion[1][1]
    FP=confusion[0][1]
    FN=confusion[1][0]
    rec_a=TP/(TP+FP)
    rec_b=TN/(TN+FN)
    return [rec_a,rec_b]
In [ ]:
zero=[precision(confm,'Zero'),recal(confm,'Zero')
In [238]:
def statistics(confusion):
    neg=[precision(confm,'Negative'),recal(confm,'Negative')]
    pos=[precision(confm,'Positive'),recal(confm,'Positive')]
    zero=[precision(confm,'Zero'),recal(confm,'Zero')]
    stats=pd.DataFrame({'Negative':neg,'Positive':pos,'Zero':zero})
    stats.index=['Precision','Recall']
    return stats
In [239]:
statistics(confm)
Out[239]:
Negative Positive Zero
Precision 0.769114 0.917428 0.965803
Recall 0.394903 0.681763 1.000000

Best method 2 features 3 classes

In [59]:
opt_data=pca_data[['FirstComponent','SecondComponent']]
In [60]:
data['SharpSign']=data.Sharp.apply(np.sign)
opt_data['Target']=data['SharpSign']
<ipython-input-60-9563ede527cc>:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [14]:
sns.scatterplot(opt_data.FirstComponent,opt_data.SecondComponent,hue=opt_data.Target,palette='plasma')
plt.grid(True)
In [63]:
X=opt_data.drop(columns=['Target'])
y=opt_data.Target
In [20]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
In [62]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.9, random_state=42)
In [18]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42)
In [19]:
K_LIST=['linear', 'poly', 'rbf', 'sigmoid']
In [20]:
BEST_KERNEL=[]
k=0
for i in range(5):
    FIN_SCORE=[]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42)
    for ker in K_LIST:
        clf=SVC(kernel=ker)
        clf.fit(X_train,y_train)
        fin_score=clf.score(X_val,y_val)
        FIN_SCORE.append(fin_score)
        k=k+1
        print(ker + ' Kernel has been explored')
    FIN_SCORE=np.array(FIN_SCORE)
    
    BEST_KERNEL.append(K_LIST[FIN_SCORE.argmax()])
    print('Cross validation ' + str(i) + ' out of 4 \n')
linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 0 out of 4 

linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 1 out of 4 

linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 2 out of 4 

linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 3 out of 4 

linear Kernel has been explored
poly Kernel has been explored
rbf Kernel has been explored
sigmoid Kernel has been explored
Cross validation 4 out of 4 

In [21]:
sns.countplot(BEST_KERNEL)
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fea34a0d4f0>
In [22]:
best_kernel='rbf'
In [23]:
c_list=np.arange(0.5,50.5,0.5)
k=0
PERC=['20%','40%','60%','80%','100%']
K=[20,40,60,80,100]
BEST_C=[]
for i in range(5):
    FIN_SCORE=[]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42)
    k=0
    for c in c_list:
        k=k+1
        clf=SVC(C=c,kernel=best_kernel)
        clf.fit(X_train,y_train)
        fin_score=clf.score(X_val,y_val)
        FIN_SCORE.append(fin_score)
        #k=k+1
        if k in K:
            ind=K.index(k)
            print (PERC[ind] + ' of the C values has been explored')
    FIN_SCORE=np.array(FIN_SCORE)
    BEST_C.append(c_list[FIN_SCORE.argmax()])
    print('Cross validation ' + str(i) + ' out of 4 \n')
20% of the C values has been explored
40% of the C values has been explored
60% of the C values has been explored
80% of the C values has been explored
100% of the C values has been explored
Cross validation 0 out of 4 

20% of the C values has been explored
40% of the C values has been explored
60% of the C values has been explored
80% of the C values has been explored
100% of the C values has been explored
Cross validation 1 out of 4 

20% of the C values has been explored
40% of the C values has been explored
60% of the C values has been explored
80% of the C values has been explored
100% of the C values has been explored
Cross validation 2 out of 4 

20% of the C values has been explored
40% of the C values has been explored
60% of the C values has been explored
80% of the C values has been explored
100% of the C values has been explored
Cross validation 3 out of 4 

20% of the C values has been explored
40% of the C values has been explored
60% of the C values has been explored
80% of the C values has been explored
100% of the C values has been explored
Cross validation 4 out of 4 

In [24]:
sns.countplot(BEST_C)
plt.xlabel('Chosen C')
plt.ylabel('Count')
Out[24]:
Text(0, 0.5, 'Count')
In [25]:
FIN_SCORE=np.array(FIN_SCORE)
In [15]:
best_c=14.5
best_kernel='rbf'
In [64]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.7, random_state=42)
In [65]:
clf=SVC(kernel=best_kernel,C=best_c)
clf.fit(X_train,y_train)
fin_score=clf.score(X_test,y_test)
In [66]:
prediction=clf.predict(X_test)
In [67]:
print('The final score with 2 feature is ' + str(fin_score*100) +'% ')
The final score with 2 feature is 71.05672105672106% 
In [68]:
pred_data=X_test.copy()
pred_data['Target']=y_test
pred_data['Prediction']=prediction
In [34]:
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
def make_meshgrid(x, y, h=.4):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out
In [32]:
y=y_test
In [35]:
fig, ax = plt.subplots()
# title for the plots
title = ('Decision surface of linear SVC ')
# Set-up grid for plotting.
X0, X1 = X_test['FirstComponent'], X_test['SecondComponent']
xx, yy = make_meshgrid(X0, X1)

plot_contours(ax, clf, xx, yy, cmap='plasma', alpha=0.8)
ax.scatter(X0, X1, c=y, cmap='plasma', s=20, edgecolors='k')
ax.set_ylabel('Second Component')
ax.set_xlabel('First Component')


violet_patch = mpatches.Patch(color='navy', label='Sharp<0')
yellow_patch = mpatches.Patch(color='gold', label='Sharp>0')
pink_patch = mpatches.Patch(color='magenta', label='Sharp=0')

plt.legend(handles=[violet_patch,yellow_patch,pink_patch])

ax.set_xticks(())
ax.set_yticks(())
ax.set_title('Decision Surface', fontsize=20)
#ax.legend()
plt.show()
In [36]:
test_data=pred_data
In [37]:
import itertools
from string import ascii_uppercase
from sklearn.metrics import confusion_matrix

y_test=test_data.Target
predic = prediction

columns = ['Negative','Zero','Positive']

confm = confusion_matrix(y_test, predic)
df_cm = pd.DataFrame(confm.astype(float), index=columns, columns=columns)

ax = sns.heatmap(df_cm, cmap='plasma',annot=True,fmt='g')
In [103]:
def precision(confusion,clas):
    if clas=='Negative':
        TP=confusion[0][0]
        FN=confusion[1][0]+confusion[2][0]
        pres=TP/(TP+FN)
    if clas=='Positive':
        TP=confusion[2][2]
        FN=confusion[2][0]+confusion[2][1]
        pres=TP/(TP+FN)
    if clas=='Zero':
        TP=confusion[1][1]
        FN=confusion[1][0]+confusion[1][2]
        pres=TP/(TP+FN)
    return pres
In [104]:
def recal(confusion,clas):
    if clas=='Negative':
        TP=confusion[0][0]
        FP=confusion[0][1]+confusion[0][2]
        rec=TP/(TP+FP)
    if clas=='Positive':
        TP=confusion[2][2]
        FP=confusion[0][2]+confusion[1][2]
        rec=TP/(TP+FP)
    if clas=='Zero':
        TP=confusion[1][1]
        FP=confusion[0][1]+confusion[2][1]
        rec=TP/(TP+FP)
    return rec
In [41]:
precision(confm,'Negative'),precision(confm,'Positive'),precision(confm,'Zero')
Out[41]:
(0.7746332790042969, 0.9248214196513312, 0.9658031088082901)
In [42]:
recal(confm,'Negative'),recal(confm,'Positive'),recal(confm,'Zero')
Out[42]:
(0.3711223113508909, 0.6751267184480181, 1.0)
In [43]:
def recall(confusion):
    TP=confusion[0][0]
    TN=confusion[1][1]
    FP=confusion[0][1]
    FN=confusion[1][0]
    rec_a=TP/(TP+FP)
    rec_b=TN/(TN+FN)
    return [rec_a,rec_b]
In [44]:
def statistics(confusion):
    neg=[precision(confm,'Negative'),recal(confm,'Negative')]
    pos=[precision(confm,'Positive'),recal(confm,'Positive')]
    zero=[precision(confm,'Zero'),recal(confm,'Zero')]
    stats=pd.DataFrame({'Negative':neg,'Positive':pos,'Zero':zero})
    stats.index=['Precision','Recall']
    return stats
In [45]:
statistics(confm)
Out[45]:
Negative Positive Zero
Precision 0.774633 0.924821 0.965803
Recall 0.371122 0.675127 1.000000

Best Method

In [70]:
wrong_data=pred_data[pred_data['Prediction']==1.0].drop(columns=['Target'])
wrong_target=pred_data[pred_data['Prediction']==1.0].Target
In [71]:
sns.scatterplot(wrong_data.FirstComponent,wrong_data.SecondComponent,hue=wrong_target)
Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fce608a3a60>
In [72]:
X=data.loc[X_test.index].drop(columns=['#ID','Sharp','SharpSign'])
y=data.loc[X_test.index].SharpSign
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
In [73]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

 

#Create a Gaussian Classifier
rf=RandomForestClassifier(n_estimators=500, min_samples_split = 20, max_features = 5)

 

# Train the model on training data
rf.fit(X_train, y_train)
Out[73]:
RandomForestClassifier(max_features=5, min_samples_split=20, n_estimators=500)
In [74]:
feature_names = ['X','Y','F606W','error','F814W','error.1','Chi']
# Creating a bar plot
feature_imp = pd.Series(rf.feature_importances_,index=feature_names).sort_values(ascending=False)
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()
No handles with labels found to put in legend.
In [75]:
Results=X_test.copy()
Results['Target']=y_test
Results['Pred']=rf.predict(X_test)
Results['FirstComponent']=opt_data['FirstComponent'].loc[Results.index]
Results['SecondComponent']=opt_data['SecondComponent'].loc[Results.index]
In [76]:
good_ones=pred_data[(pred_data.Prediction==0.) |(pred_data.Prediction==-1.0)]
In [77]:
import plotly.express as px

fig = px.scatter_3d(Results, x='error', y='Y', z='Chi',
              color='Pred')
fig.update_traces(marker=dict(size=2))

fig.show()
In [79]:
for feat in feature_names:
    good_ones[feat]=data[feat].loc[good_ones.index]
<ipython-input-79-c452d75d6622>:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [80]:
good_ones=good_ones.rename(columns={'Prediction':'Pred'})
In [81]:
Results=Results.append(good_ones)
In [82]:
from sklearn.metrics import accuracy_score
acc=accuracy_score(Results.Pred, Results.Target)
In [84]:
import itertools
from string import ascii_uppercase
from sklearn.metrics import confusion_matrix

y_test=Results.Target
predic = Results.Pred

columns = ['Negative','Zero','Positive']

confm = confusion_matrix(y_test, predic)
df_cm = pd.DataFrame(confm.astype(float), index=columns, columns=columns)

ax = sns.heatmap(df_cm, cmap='plasma',annot=True,fmt='g')
In [105]:
def statistics(confusion):
    neg=[precision(confm,'Negative'),recal(confm,'Negative')]
    pos=[precision(confm,'Positive'),recal(confm,'Positive')]
    zero=[precision(confm,'Zero'),recal(confm,'Zero')]
    stats=pd.DataFrame({'Negative':neg,'Positive':pos,'Zero':zero})
    stats.index=['Precision','Recall']
    return stats
In [106]:
statistics(confm)
Out[106]:
Negative Positive Zero
Precision 0.776127 0.627214 0.989723
Recall 0.895888 0.804403 1.000000
In [94]:
Tot_res=pd.DataFrame({'Performance':[71,74,80,82]})
Tot_res.index=['SVM','Decision Tree','Random Forest','Ensemble Learning']
Tot_res
Out[94]:
Performance
SVM 71
Decision Tree 74
Random Forest 80
Ensemble Learning 82
ERROR! Session/line number was not unique in database. History logging moved to new session 1008
In [100]:
sns.barplot(x=Tot_res.index,y=Tot_res.Performance,palette='plasma')
plt.xlabel('Method',fontsize=20)
plt.ylabel('Accuracy (%)',fontsize=20)
plt.grid(True)